Predict Russian Housing Market
Posted on Sunday, 23 September 2018 in Kaggle
Kaggle: Predict realty prices on the Sberbank Russian Housing Market¶
Predict realty prices with housing data and macroeconomic patterns.
1) Data Cleaning¶
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas_profiling
import datetime
import warnings
warnings.filterwarnings('ignore')
#macro_cols = ["timestamp", "balance_trade", "balance_trade_growth", "eurrub", "average_provision_of_build_contract",
#"micex_rgbi_tr", "micex_cbi_tr", "deposits_rate", "mortgage_value", "mortgage_rate",
#"income_per_cap", "rent_price_4+room_bus"]
macro_cols = ["timestamp", "eurrub", "micex_rgbi_tr", "micex_cbi_tr"]
df_train = pd.read_csv('./train.csv')
df_macro = pd.read_csv('./macro.csv', usecols= macro_cols)
df_test = pd.read_csv('./test.csv')
print("Train : ")
print(df_train.shape[0])
print(df_train.shape[1])
print("Macro : ")
print(df_macro.shape[0])
print(df_macro.shape[1])
print("Test : ")
print(df_test.shape[0])
print(df_test.shape[1])
Merge Macro¶
df_train = pd.merge_ordered(df_train, df_macro, on='timestamp', how='left')
df_test = pd.merge_ordered(df_test, df_macro, on='timestamp', how='left')
Select Columns with Null values¶
.isnull().any(): 1) .isnull() returns True for every cell that contains a null value; 2) .any() returns True for a column when at least one of its elements is True. Together they flag every column that has at least one missing value.
null_columns = df_train.isnull().any()
df_train_null = df_train.loc[:,null_columns]
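As a quick illustration of this selection pattern, here is a minimal toy example (not part of the pipeline):
toy = pd.DataFrame({"a": [1, None, 3], "b": [1, 2, 3]})
print(toy.isnull().any())                       # a: True, b: False
print(toy.loc[:, toy.isnull().any()].columns)   # only column "a" is kept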
Count null values¶
null_counts = df_train_null.isnull().sum().sort_values(ascending=False)
print(null_counts / df_train.shape[0])
Drop columns with too many Missing Values¶
Delete columns with 20% or more missing values.
percentage_null = null_counts / df_train.shape[0]
useless_cols = []
for index, el in enumerate(percentage_null):
    if el >= 0.20:
        useless_cols.append(percentage_null.index[index])
df_train.drop(useless_cols, axis = 1, inplace = True)
print("Number of Columns Deleted Train: " + str(len(useless_cols)))
Test File¶
null_columns = df_test.isnull().any()
df_test_null = df_test.loc[:, null_columns]
null_counts = df_test_null.isnull().sum()
percentage_null = null_counts / df_test.shape[0]
useless_cols = []
for index, el in enumerate(percentage_null):
    if el >= 0.20:
        useless_cols.append(percentage_null.index[index])
df_test.drop(useless_cols, axis = 1, inplace = True)
print("Number of Columns Deleted Test: " + str(len(useless_cols)))
Remove single-value columns¶
drop_columns = []
for col in df_train.columns:
    # unique() also counts the pandas missing value NaN, so drop missing values first
    length = len(df_train[col].dropna().unique())
    if length == 1:
        drop_columns.append(col)
print(drop_columns)
##### Test File
for col in df_test.columns:
    # unique() also counts the pandas missing value NaN, so drop missing values first
    length = len(df_test[col].dropna().unique())
    if length == 1:
        drop_columns.append(col)
df_test.drop(drop_columns, axis=1, inplace=True)
print(drop_columns)
There are no columns with a single value.
Column Types¶
print(df_train.dtypes.value_counts())
print("\n" + str(df_test.dtypes.value_counts()))
Replace missing values with the column mean¶
# Select Int and Float columns
int_float_df = df_train.select_dtypes(include=['int64','float64'])
# fill missing values with mean column values
df_train.fillna(int_float_df.mean(), inplace=True)
# Check if there is Null Values in columns
print(sum(df_train.isnull().sum() != 0))
##### Test File
int_float_df = df_test.select_dtypes(include=['int64','float64'])
df_test.fillna(int_float_df.mean(), inplace=True)
print(sum(df_test.isnull().sum() != 0))
# Replace product_type NA values with the mode
df_test["product_type"].fillna(df_test["product_type"].mode().iloc[0], inplace=True)
print(sum(df_test.isnull().sum() != 0))
df_train.to_csv('./clean_train.csv', index=False)
df_test.to_csv('./clean_test.csv', index=False)
2) Data Exploration & Feature Engineering¶
df_train = pd.read_csv('./clean_train.csv')
df_test = pd.read_csv('./clean_test.csv')
Exploring the Realty Prices¶
fig, ax = plt.subplots(figsize=(8, 16))
ax.ticklabel_format(useOffset=False, style='plain')
print ("Skew is:", df_train.price_doc.skew())
ax.hist(df_train.price_doc, bins=100, color='blue')
plt.xlim(0, 40000000)
plt.show()
The distribution of prices is positively skewed, so we apply a log transform to the target variable.
The goal is to strengthen the linear relationship between the features and the target.
We will need to transform the predictions back to the original scale using the inverse of the transformation applied to the data.
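The round trip between the two scales is simply np.log and np.exp; a quick sanity check on a toy array:
prices = np.array([1000000.0, 5500000.0, 12000000.0])
log_prices = np.log(prices)             # scale used for modelling
recovered = np.exp(log_prices)          # inverse transform, applied later to the predictions
print(np.allclose(prices, recovered))   # True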
target = np.log(df_train.price_doc)
print ("Skew is:", target.skew())
plt.hist(target, bins=100, color='blue')
plt.show()
# Change Scientific Notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)
#df_train.price_doc.describe()
target = np.log(df_train.price_doc)
target.describe()
Remove price outliers (log price more than 3 standard deviations from the mean)¶
print(df_train.shape)
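# Keep only the rows whose log price lies within 3 standard deviations of the mean log price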
df_train = df_train[np.abs(np.log(df_train.price_doc)-np.log(df_train.price_doc).mean())<=(3.0*np.log(df_train.price_doc).std())]
print(df_train.shape)
Examine Correlations between Features and the Target Variable.¶
numeric_features = df_train.select_dtypes(include=[np.number])
corr = numeric_features.corr()
print (corr['price_doc'].sort_values(ascending=False)[100:150], '\n')
Keep features with a non-negative correlation with the target¶
columns_corr = list(corr['price_doc'][corr['price_doc'] < 0.00].index)
#keep special features
#columns_corr.remove("kitch_sq")
#floor max_floor kitch_sq full_sq
df_train = df_train.drop(columns_corr, axis = 1)
print(df_train.columns)
print(len(df_train.columns))
# Test File
df_test = df_test.drop(columns_corr, axis = 1)
print(df_test.columns)
print(len(df_test.columns))
Plot Correlation Between Price and Total Area¶
target = np.log(df_train.price_doc)
plt.scatter(x=df_train['full_sq'], y=target)
plt.xlim(0, 250)
plt.ylabel('Sale Price')
plt.xlabel('Total Area in square meters')
plt.show()
We can see some outliers; let's remove them.
df_train = df_train[df_train['full_sq'] < 250]
df_train.shape
Explore categorical columns¶
object_columns_df = df_train.select_dtypes(include=['object'])
object_columns_df.head(1)
# We need to skip the Date Column
filter_object_columns_df = list(object_columns_df.columns)[1:]
for column in filter_object_columns_df:
    print(df_train[column].value_counts())
Select the top 50 areas and replace the other areas with the label "Other". This reduces the number of columns created when building dummy variables¶
less_frequent_area = object_columns_df.sub_area.value_counts()[50:].index
df_train.sub_area.replace(less_frequent_area, "Other", inplace=True)
Remove "Poselenie Klenovskoe" which is not in test set¶
print(df_train.shape)
df_train = df_train[df_train.sub_area != "Poselenie Klenovskoe"]
print(df_train.shape)
Creation of Dummy Variables¶
# Remove sub Area with too many different values
filter_object_columns_df.remove("sub_area")
dummy_df = pd.get_dummies(df_train[filter_object_columns_df])
df_train = pd.concat([df_train, dummy_df], axis=1)
filter_object_columns_df.append("sub_area")
df_train = df_train.drop(filter_object_columns_df, axis=1)
##### Test File
object_columns_df = df_test.select_dtypes(include=['object'])
filter_object_columns_df = list(object_columns_df.columns)[1:]  # skip the timestamp column
# Remove sub Area with too many different values
filter_object_columns_df.remove("sub_area")
dummy_df_test = pd.get_dummies(df_test[filter_object_columns_df])
df_test = pd.concat([df_test, dummy_df_test], axis=1)
filter_object_columns_df.append("sub_area")
df_test = df_test.drop(filter_object_columns_df, axis=1)
#Train
df_train["timestamp"] = pd.to_datetime(df_train["timestamp"])
month_year = (df_train.timestamp.dt.month + df_train.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
#map month_year with value counts
df_train['month_year_count'] = month_year.map(month_year_cnt_map)
month_year = (df_train.timestamp.dt.weekofyear + df_train.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
df_train['week_year_count'] = month_year.map(month_year_cnt_map)
#Test
df_test["timestamp"] = pd.to_datetime(df_test["timestamp"])
month_year = (df_test.timestamp.dt.month + df_test.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
df_test['month_year_count'] = month_year.map(month_year_cnt_map)
month_year = (df_test.timestamp.dt.weekofyear + df_test.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
df_test['week_year_count'] = month_year.map(month_year_cnt_map)
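Note that dt.weekofyear has since been deprecated in pandas (1.1+); on a recent version the same week feature can be built with dt.isocalendar().week, as in this sketch:
week_year = df_train.timestamp.dt.isocalendar().week.astype(int) + df_train.timestamp.dt.year * 100
df_train['week_year_count'] = week_year.map(week_year.value_counts())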
Add month and day-of-week¶
#Train
df_train['month'] = df_train["timestamp"].dt.month
df_train['day'] = df_train["timestamp"].dt.dayofweek
#Test
df_test['month'] = df_test["timestamp"].dt.month
df_test['day'] = df_test["timestamp"].dt.dayofweek
Other Feature Engineering¶
df_train['rel_floor'] = df_train['floor'] / df_train['max_floor'].astype(float)
df_train['rel_kitch_sq'] = df_train['kitch_sq'] / df_train['full_sq'].astype(float)
df_test['rel_floor'] = df_test['floor'] / df_test['max_floor'].astype(float)
df_test['rel_kitch_sq'] = df_test['kitch_sq'] / df_test['full_sq'].astype(float)
Check that there are no null values¶
print(sum(df_train.isnull().sum() != 0))
print(sum(df_test.isnull().sum() != 0))
Replace the NaN values produced by division by zero with 0¶
df_train["rel_floor"].fillna(0, inplace=True)
df_train["rel_kitch_sq"].fillna(0, inplace=True)
df_test["rel_floor"].fillna(0, inplace=True)
df_test["rel_kitch_sq"].fillna(0, inplace=True)
print(sum(df_train.isnull().sum() != 0))
print(sum(df_test.isnull().sum() != 0))
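One caveat: dividing a nonzero numerator by zero yields inf rather than NaN, and fillna does not touch it. A defensive sketch to neutralize such values as well, in case max_floor or full_sq is ever 0:
for df in (df_train, df_test):
    df['rel_floor'] = df['rel_floor'].replace([np.inf, -np.inf], 0)
    df['rel_kitch_sq'] = df['rel_kitch_sq'].replace([np.inf, -np.inf], 0)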
df_train.to_csv('./final_train.csv', index=False)
df_test.to_csv('./final_test.csv', index=False)
3) Making Predictions¶
df_train = pd.read_csv('./final_train.csv')
df_test = pd.read_csv('./final_test.csv')
X = df_train.drop(['price_doc', 'timestamp', 'id'], axis=1)
Y = np.log(df_train.price_doc)
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from math import sqrt
Split Train and Test (80/20)¶
from sklearn.model_selection import train_test_split
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, random_state=42, test_size=.20)
Cross Validation : Model Tuning¶
params_tuning = {
'max_depth': [6],
'min_child_weight': [5],
'colsample_bytree': [0.8],
'subsample': [0.7, 0.8],
'learning_rate': [0.07]
}
xgb_params = {
'eta': 0.05,
'max_depth': 6,
'min_child_weight': 5,
#'subsample': 0.7,
#'colsample_bytree': 0.7,
'objective': 'reg:linear',
'eval_metric': 'rmse',
'silent': 1,
'n_estimators': 100
}
gridsearch_xgb = GridSearchCV(xgb.XGBRegressor(**xgb_params), params_tuning, cv=3)
gridsearch_xgb.fit(X, Y)
gridsearch_xgb.cv_results_
gridsearch_xgb.best_params_
gridsearch_xgb.best_score_
y_predicted = gridsearch_xgb.best_estimator_.predict(X_validation)
RMSE = sqrt(mean_squared_error(Y_validation, y_predicted))
print(RMSE)
Best Score : 0.40898454506531656
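Since the RMSE is computed on log prices, it can be read as a multiplicative error; as a rough interpretation:
# An RMSE of ~0.409 on log(price) means a typical prediction is off by a factor
# of about exp(0.409), i.e. roughly 50% above or below the true price
print(np.exp(0.40898454506531656))   # ~1.505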
Boost Iteration¶
df_columns = X.columns
dfinal = xgb.DMatrix(X, Y, feature_names=df_columns)
dtrain = xgb.DMatrix(X_train, Y_train, feature_names=df_columns)
dvalidation = xgb.DMatrix(X_validation, Y_validation, feature_names=df_columns)
xgb_params = {
'eta': 0.07,
'max_depth': 6,
'min_child_weight': 5,
'subsample': 0.8,
'colsample_bytree': 0.8,
'objective': 'reg:linear',
'eval_metric': 'rmse',
'silent': 1
}
model_xgb = xgb.train(xgb_params, dtrain, num_boost_round=1000, early_stopping_rounds=20,
evals=[(dvalidation, 'validation')], verbose_eval=10)
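# best_iteration is the boosting round with the lowest validation RMSE found by early stopping;
# it is reused below to retrain on the full dataset without a validation split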
num_boost_round = model_xgb.best_iteration
#watchlist = [(dtrain, 'train'), (dvalidation, 'eval')]
final_model_xgb = xgb.train(xgb_params, dfinal, num_boost_round = num_boost_round)
fig, ax = plt.subplots(1, 1, figsize=(8, 10))
xgb.plot_importance(model_xgb, max_num_features=50, height=0.5, ax = ax)
Examine whether there is overfitting¶
#Validation Set
y_predicted = final_model_xgb.predict(dvalidation)
RMSE = sqrt(mean_squared_error(Y_validation, y_predicted))
print(RMSE)
#Train Set
y_predicted = final_model_xgb.predict(dtrain)
RMSE = sqrt(mean_squared_error(Y_train, y_predicted))
print(RMSE)
Submission File¶
#Get the Id for the submission file
id_test = df_test.id
df_test = df_test.drop(['timestamp','id'], axis=1)
dtest = xgb.DMatrix(df_test)
Y_pred = final_model_xgb.predict(dtest)
# Transform the predictions back to the original scale: reverse log() with exp()
final_predictions = np.exp(Y_pred)
print ("Original predictions are: \n", Y_pred[:5], "\n")
print ("Final predictions are: \n", final_predictions[:5])
df_submission = pd.DataFrame({'id': id_test, 'price_doc': final_predictions})
df_submission.to_csv('submission.csv', index=False)